# Read the SDR 2023 CSV and pass it through clean_names() in one pipeline.
sdr_data <- here("data/SDR-2023-Data.csv") %>%
  read.csv() %>%
  clean_names()

# Named lookup from colour-category name to the R colour used to draw it.
unique_colors <- c(
  "green" = "darkseagreen",
  "orange" = "darkorange1",
  "red" = "coral2",
  "yellow" = "darkgoldenrod2"
)

# Horizontal bar chart of SDG 7 scores: one bar per country (ordered by
# score), one panel per SDR region, bars filled by the goal_7_dash category
# using the `unique_colors` lookup.
goal_7_faceted_bar_plot <- ggplot(
  data = sdr_data,
  aes(
    x = goal_7_score,
    y = reorder(country, goal_7_score),
    fill = goal_7_dash
  )
) +
  # geom_col() always uses the identity stat; `stat =` is not one of its
  # parameters and only produced an "unknown parameters" warning.
  geom_col() +
  # Free y scales so each panel lists only its own region's countries.
  facet_wrap(~regions_used_for_the_sdr, scales = "free_y") +
  scale_fill_manual(values = unique_colors) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 4)) +
  labs(
    x = "SDG 7 Score",
    y = ""
  )
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
# Convert the faceted bar chart into an interactive plotly widget.
ggplotly(goal_7_faceted_bar_plot)
## Warning: Removed 27 rows containing missing values (`position_stack()`).
# Scatter plot of SDG 7 score (x) against SDG 1 score (y) with a smoothed
# trend line and the correlation annotation added by stat_cor().
ggplot(sdr_data, aes(x = goal_7_score, y = goal_1_score)) +
  theme_minimal() +
  geom_point() +
  # State the smoother explicitly: this is the method/formula ggplot2 was
  # picking implicitly, so the fit is unchanged but no longer depends on the
  # default and the "using method = 'loess'" message is not emitted.
  geom_smooth(method = "loess", formula = y ~ x) +
  stat_cor()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 42 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 42 rows containing non-finite values (`stat_cor()`).
## Warning: Removed 42 rows containing missing values (`geom_point()`).

# Scatter of SDG 7 score (x) against SDG 1 score (y), coloured by SDR region.
# The `label` aesthetic is not drawn by geom_point(); presumably it is there
# so the country name shows up in the plotly tooltip -- TODO confirm.
# NOTE(review): the object is named "goal_7_and_9" but the y aesthetic is
# goal_1_score -- confirm whether the name or the variable is the intended one.
goal_7_and_9_scatter_plot <- sdr_data %>%
  ggplot(
    mapping = aes(
      x = goal_7_score,
      y = goal_1_score,
      color = regions_used_for_the_sdr,
      label = country
    )
  ) +
  geom_point() +
  scale_color_brewer(palette = "Set3") +
  theme_minimal()

# Interactive version of the scatter plot.
ggplotly(goal_7_and_9_scatter_plot)
# Medium-scale country polygons as an sf object, reduced to the long name,
# the ISO3 code and the geometry column.
world <- ne_countries(scale = "medium", returnclass = "sf") %>%
  select(name_long, iso_a3, geometry)

# Rename the SDR ISO3 column in place so both tables share the key `iso_a3`.
# (No-op when the column is absent, e.g. when this chunk is re-run.)
colnames(sdr_data)[colnames(sdr_data) == "country_code_iso3"] <- "iso_a3"

# Keep every SDR row and attach the matching polygon where one exists.
joined_df <- left_join(sdr_data, world, by = "iso_a3")

# Promote the joined table to sf and express it in WGS84 longitude/latitude.
world_df_joined <- joined_df %>%
  st_as_sf() %>%
  st_transform("+proj=longlat +datum=WGS84")
# One HTML label per row of world_df_joined: the country name, a line break,
# then the SDG 7 score rounded to two decimals.
mytext <- lapply(
  paste0(
    "Country: ", world_df_joined$country, "<br/>",
    "Goal 7 Score: ", round(world_df_joined$goal_7_score, 2)
  ),
  htmltools::HTML
)

# World map with polygons coloured by quantile of the SDG 7 score on the
# "YlOrRd" palette; the labels built above are attached to the polygons.
leaflet(world_df_joined) %>%
  addTiles() %>%
  setView(lng = 0, lat = 10, zoom = 2) %>%
  addPolygons(
    stroke = FALSE,
    fillOpacity = 0.5,
    smoothFactor = 0.5,
    color = ~colorQuantile("YlOrRd", goal_7_score)(goal_7_score),
    label = mytext
  )

Correlation matrix

# The 17 goal-score columns, goal_1_score ... goal_17_score, in goal order.
sdr_scores <- select(sdr_data, all_of(paste0("goal_", 1:17, "_score")))
sdr_scores_matrix <- as.matrix(sdr_scores)

# Correlation matrix of the goal scores. With use = "complete.obs" only rows
# that have no NA in any of the 17 columns contribute.
# NOTE(review): the result is stored under the name `cor`, which masks the
# function of the same name for value lookups -- consider renaming once all
# users of this variable are known.
cor <- stats::cor(sdr_scores_matrix, use = "complete.obs")

# Lower-triangle correlation plot drawn with circles and printed coefficients.
ggcorrplot::ggcorrplot(cor, method = "circle", type = "lower", lab = TRUE)

Goal 7 tile plot

# Country, region, the four normalized SDG 7 indicator columns, and the
# goal 7 dash and trend columns.
goal_seven_data <- select(
  sdr_data,
  country,
  regions_used_for_the_sdr,
  normalized_score_sdg7_elecac,
  normalized_score_sdg7_cleanfuel,
  normalized_score_sdg7_co2twh,
  normalized_score_sdg7_renewcon,
  goal_7_dash,
  goal_7_trend
)

# Long format: one row per country x indicator, with the indicator name in
# `variable` and its score in `value`.
melted_data <- goal_seven_data %>%
  pivot_longer(
    cols = starts_with("normalized_score_sdg7"),
    names_to = "variable",
    values_to = "value"
  )

# Heatmap of indicator scores: indicators on x, countries on y, one panel per
# region with its own y axis.
melted_data %>%
  ggplot(aes(x = variable, y = country, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c() +
  facet_wrap(~regions_used_for_the_sdr, scales = "free_y") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 5),
    axis.text.y = element_text(size = 5)
  ) +
  labs(x = "", y = "", fill = "Score")

TODO: also show the goal 7 dash and trend columns on the x axis of the tile plot.

# Stacked histogram of SDG 7 scores, with bars filled by SDR region.
ggplot(sdr_data, aes(x = goal_7_score, fill = regions_used_for_the_sdr)) +
  # The bin count is stated explicitly (30 is the value that was being used
  # implicitly), so the binning is a visible choice and the "Pick better
  # value" message is not emitted.
  geom_histogram(bins = 30, color = "black") +
  theme_minimal() +
  scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 27 rows containing non-finite values (`stat_bin()`).

Missing data

# Missing-value summary for each column of sdr_scores.
gg_miss_var(sdr_scores)

# SDG 7 vs SDG 1 scatter coloured by region, drawn with geom_miss_point()
# instead of geom_point() -- presumably so that rows with a missing score
# remain visible; confirm against the geom's documentation.
sdr_data %>%
  ggplot(
    mapping = aes(
      x = goal_7_score,
      y = goal_1_score,
      color = regions_used_for_the_sdr,
      label = country
    )
  ) +
  geom_miss_point() +
  scale_color_brewer(palette = "Set3") +
  theme_minimal()

Cleaning and Imputation for ML - Clustering and Random Forest Regression/Classification

# Country name plus every column whose name contains "normalized_score".
sdr_data_normalized_scores <- sdr_data %>%
  select(country, contains("normalized_score"))

# Percentage of missing values per column.
gg_miss_var(sdr_data_normalized_scores, show_pct = TRUE)

# Keep only columns in which at most 30% of the values are missing.
sdr_data_normalized_scores_less_na <- sdr_data_normalized_scores %>%
  select(where(~ mean(is.na(.x)) <= 0.3))

# Impute the remaining gaps with missRanger.
# - `. ~ . - country`: `country` is a row identifier, so it is kept in the
#   output but no longer used as a predictor for the other columns.
# - `seed`: fixes the random generator so the imputed values, and everything
#   fitted on them afterwards, are reproducible between runs.
sdr_data_imputed <- missRanger(
  sdr_data_normalized_scores_less_na,
  formula = . ~ . - country,
  seed = 123
)
## 
## Missing value imputation by random forests
## 
##   Variables to impute:       normalized_score_sdg1_wpc, normalized_score_sdg1_lmicpov, normalized_score_sdg2_undernsh, normalized_score_sdg2_stunting, normalized_score_sdg2_wasting, normalized_score_sdg2_obesity, normalized_score_sdg2_trophic, normalized_score_sdg2_crlyld, normalized_score_sdg2_snmi, normalized_score_sdg3_matmort, normalized_score_sdg3_neonat, normalized_score_sdg3_u5mort, normalized_score_sdg3_tb, normalized_score_sdg3_ncds, normalized_score_sdg3_pollmort, normalized_score_sdg3_traffic, normalized_score_sdg3_lifee, normalized_score_sdg3_fertility, normalized_score_sdg3_births, normalized_score_sdg3_vac, normalized_score_sdg3_uhc, normalized_score_sdg3_swb, normalized_score_sdg4_earlyedu, normalized_score_sdg4_primary, normalized_score_sdg4_second, normalized_score_sdg4_literacy, normalized_score_sdg5_familypl, normalized_score_sdg5_edat, normalized_score_sdg5_lfpr, normalized_score_sdg5_parl, normalized_score_sdg6_water, normalized_score_sdg6_sanita, normalized_score_sdg6_freshwat, normalized_score_sdg6_wastewat, normalized_score_sdg6_scarcew, normalized_score_sdg7_elecac, normalized_score_sdg7_cleanfuel, normalized_score_sdg7_co2twh, normalized_score_sdg7_renewcon, normalized_score_sdg8_adjgrowth, normalized_score_sdg8_slavery, normalized_score_sdg8_accounts, normalized_score_sdg8_unemp, normalized_score_sdg8_impacc, normalized_score_sdg8_impslav, normalized_score_sdg9_roads, normalized_score_sdg9_intuse, normalized_score_sdg9_mobuse, normalized_score_sdg9_lpi, normalized_score_sdg9_uni, normalized_score_sdg9_articles, normalized_score_sdg9_rdex, normalized_score_sdg10_gini, normalized_score_sdg10_palma, normalized_score_sdg11_slums, normalized_score_sdg11_pm25, normalized_score_sdg11_pipedwat, normalized_score_sdg11_transport, normalized_score_sdg12_msw, normalized_score_sdg12_ewaste, normalized_score_sdg12_so2prod, normalized_score_sdg12_so2import, normalized_score_sdg12_nprod, normalized_score_sdg12_nimport, 
normalized_score_sdg12_explastic, normalized_score_sdg13_co2gcp, normalized_score_sdg13_co2import, normalized_score_sdg13_co2export, normalized_score_sdg14_biomar, normalized_score_sdg15_cpta, normalized_score_sdg15_cpfa, normalized_score_sdg15_redlist, normalized_score_sdg15_forchg, normalized_score_sdg15_biofrwter, normalized_score_sdg16_homicides, normalized_score_sdg16_detain, normalized_score_sdg16_safe, normalized_score_sdg16_u5reg, normalized_score_sdg16_cpi, normalized_score_sdg16_weaponsexp, normalized_score_sdg16_rsf, normalized_score_sdg17_govex, normalized_score_sdg17_cohaven, normalized_score_sdg17_statperf
##   Variables used to impute:  country, normalized_score_sdg1_wpc, normalized_score_sdg1_lmicpov, normalized_score_sdg2_undernsh, normalized_score_sdg2_stunting, normalized_score_sdg2_wasting, normalized_score_sdg2_obesity, normalized_score_sdg2_trophic, normalized_score_sdg2_crlyld, normalized_score_sdg2_snmi, normalized_score_sdg3_matmort, normalized_score_sdg3_neonat, normalized_score_sdg3_u5mort, normalized_score_sdg3_tb, normalized_score_sdg3_ncds, normalized_score_sdg3_pollmort, normalized_score_sdg3_traffic, normalized_score_sdg3_lifee, normalized_score_sdg3_fertility, normalized_score_sdg3_births, normalized_score_sdg3_vac, normalized_score_sdg3_uhc, normalized_score_sdg3_swb, normalized_score_sdg4_earlyedu, normalized_score_sdg4_primary, normalized_score_sdg4_second, normalized_score_sdg4_literacy, normalized_score_sdg5_familypl, normalized_score_sdg5_edat, normalized_score_sdg5_lfpr, normalized_score_sdg5_parl, normalized_score_sdg6_water, normalized_score_sdg6_sanita, normalized_score_sdg6_freshwat, normalized_score_sdg6_wastewat, normalized_score_sdg6_scarcew, normalized_score_sdg7_elecac, normalized_score_sdg7_cleanfuel, normalized_score_sdg7_co2twh, normalized_score_sdg7_renewcon, normalized_score_sdg8_adjgrowth, normalized_score_sdg8_slavery, normalized_score_sdg8_accounts, normalized_score_sdg8_unemp, normalized_score_sdg8_impacc, normalized_score_sdg8_impslav, normalized_score_sdg9_roads, normalized_score_sdg9_intuse, normalized_score_sdg9_mobuse, normalized_score_sdg9_lpi, normalized_score_sdg9_uni, normalized_score_sdg9_articles, normalized_score_sdg9_rdex, normalized_score_sdg10_gini, normalized_score_sdg10_palma, normalized_score_sdg11_slums, normalized_score_sdg11_pm25, normalized_score_sdg11_pipedwat, normalized_score_sdg11_transport, normalized_score_sdg12_msw, normalized_score_sdg12_ewaste, normalized_score_sdg12_so2prod, normalized_score_sdg12_so2import, normalized_score_sdg12_nprod, normalized_score_sdg12_nimport, 
normalized_score_sdg12_explastic, normalized_score_sdg13_co2gcp, normalized_score_sdg13_co2import, normalized_score_sdg13_co2export, normalized_score_sdg14_biomar, normalized_score_sdg15_cpta, normalized_score_sdg15_cpfa, normalized_score_sdg15_redlist, normalized_score_sdg15_forchg, normalized_score_sdg15_biofrwter, normalized_score_sdg16_homicides, normalized_score_sdg16_detain, normalized_score_sdg16_safe, normalized_score_sdg16_u5reg, normalized_score_sdg16_cpi, normalized_score_sdg16_weaponsexp, normalized_score_sdg16_rsf, normalized_score_sdg17_govex, normalized_score_sdg17_cohaven, normalized_score_sdg17_statperf
## iter 1:  ....................................................................................
## iter 2:  ....................................................................................
## iter 3:  ....................................................................................
## iter 4:  ....................................................................................
## iter 5:  ....................................................................................

Cluster

# Move `country` from a column into the row names so the table holds only
# the numeric indicator columns and the cluster plot can label points by country.
sdr_data_imputed <- sdr_data_imputed %>%
  remove_rownames() %>%
  column_to_rownames(var = "country")

# Silhouette diagnostic for choosing the number of clusters. k-means starts
# from random centres, so the generator is seeded to make the curve repeatable.
set.seed(123)
fviz_nbclust(sdr_data_imputed, kmeans, method = "silhouette")

# Two-cluster solution. Seeded independently of the diagnostic above, and
# run from 25 random starts (keeping the best) so that a single unlucky
# initialisation cannot determine the cluster assignment.
set.seed(123)
k2 <- kmeans(sdr_data_imputed, centers = 2, nstart = 25)
fviz_cluster(k2, data = sdr_data_imputed) +
  theme_minimal()

Find key drivers of clustering (HI Appleseed Analysis)

Random Forest

# Random forest regression of the normalized maternal-mortality score on all
# other columns of sdr_data_imputed, with variable importance recorded.
# Seeded so the fit statistics and the importance ranking are reproducible.
set.seed(123)
rf_matmort <- randomForest(
  normalized_score_sdg3_matmort ~ .,
  data = sdr_data_imputed,
  importance = TRUE
)

# Print the model summary.
rf_matmort
## 
## Call:
##  randomForest(formula = normalized_score_sdg3_matmort ~ ., data = sdr_data_imputed,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 27
## 
##           Mean of squared residuals: 85.90416
##                     % Var explained: 83.28
# Importance matrix stored on the fitted forest, as a data frame whose row
# names are the predictor names.
# NOTE(review): this reads rf_matmort$importance directly; the importance()
# accessor may scale the values differently -- confirm which is wanted.
importance_df <- as.data.frame(rf_matmort$importance)

# Ten predictors with the largest `%IncMSE`, with the predictor name moved
# from the row names into a `variable` column.
importance_df_top_10 <- slice_max(
  rownames_to_column(importance_df, var = "variable"),
  order_by = `%IncMSE`,
  n = 10
)

# Horizontal bar chart of those ten predictors, ordered by `%IncMSE`.
importance_df_top_10 %>%
  ggplot(aes(x = `%IncMSE`, y = reorder(variable, `%IncMSE`))) +
  geom_col(fill = "steelblue", color = "black") +
  theme_minimal()

Partial Dependence Plots

Week 2 Day 1 - EDA: getting to know your data, troubleshooting
Week 2 Day 2 - Bar chart, basic viz, maps, histogram, bubble plots

Week 3 Day 1 - Scatterplot with trend line, correlation matrices
Week 3 Day 2 - Machine learning: imputing data, clustering, random forest